Feature Selection, Model Selection and Tuning
Machine Learning and Artificial Intelligence Course - Texas University
RAFAEL SOARES DE CARVALHO
Objective:
To predict the concrete strength using the data available in the file concrete_data.xls. Apply feature engineering and model tuning to obtain an R^2 score between 80% and 95%.
Task 1 - Exploratory data quality report
1. Univariate analysis
2. Multivariate analysis
3. Feature Engineering
#Import Libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score,roc_curve
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats
from sklearn.tree import DecisionTreeClassifier
from IPython.display import Image
from sklearn import tree
from os import system
import multiprocessing
from itertools import product
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
# Remove scientific notations and display numbers with 2 decimal points instead
pd.options.display.float_format = '{:,.2f}'.format
#pip install graphviz
#conda install -c conda-forge mlxtend
#Import Dataset
original_dataset = pd.read_csv("concrete.csv") # data as it is - to check consistency
data = pd.read_csv("concrete.csv") # data to be manipulated
#Check import consistency
data.shape
#Review: import validated - 1,030 rows and 9 columns
#Preview of the first rows
data.head(20)
#First 20 rows for data exploration
# Numerical data, named columns, many "zero" values, mostly floating-point numbers.
#Check Data Types
data.dtypes
# No need to change data types for now
#Statistical information and summary
data.describe()
#Check Data Infos
data.info()
# There are no null values - float64 (8 columns), int64 (1 column)
#Check Duplicated Rows
dup = data.duplicated()
sum(dup)
#There are 25 duplicated rows
#Remove duplicated rows
data_work = original_dataset.copy()
data_work['dup'] = dup
excluded_dups = data_work.loc[data_work['dup'] == True]  # keep a copy of the removed rows for control
data.drop_duplicates(keep='first', inplace=True)  # inplace=True so the duplicates are actually dropped from data
print('Rows in Original dataset', original_dataset.shape[0])
print('Rows in Duplicated rows', excluded_dups.shape[0])
print('Rows in Adjusted dataset', data.shape[0])
# Create a column age_months to read Age in months
data['age_months'] = np.floor(data.age/30)
# Create a column with strength quantiles for a better view of the data
data['strength_quantile'] = pd.qcut(data['strength'], 4, labels=False)
# Create a column for the Age category
data.loc[data['age'] >= 90, 'age_category'] = 90  # 90 days or more
data.loc[data['age'] < 90, 'age_category'] = data['age']
# Create flag columns for the cement mix additives
data.loc[data['superplastic'] == 0, 'contains_superplastic'] = 0
data.loc[data['superplastic'] > 0, 'contains_superplastic'] = 1
data.loc[data['ash'] == 0, 'contains_ash'] = 0
data.loc[data['ash'] > 0, 'contains_ash'] = 1
data.loc[data['slag'] == 0, 'contains_slag'] = 0
data.loc[data['slag'] > 0, 'contains_slag'] = 1
#SLAG
details = data.loc[(data['slag'] > 0)]
plt.figure(figsize=(20,5))
sns.boxplot(details['slag'])
plt.show()
# Replace slag outliers (above ~315, per the boxplot) with the mean of the non-zero slag values
data.loc[data['slag'] > 315, 'slag_adjusted'] = details['slag'].mean()
data.loc[data['slag'] <= 315, 'slag_adjusted'] = data['slag']
data
plt.figure(figsize=(20,5))
sns.boxplot(data['slag_adjusted'])
plt.show()
#WATER
details = data.loc[(data['water'] > 0)]
plt.figure(figsize=(20,5))
sns.boxplot(details['water'])
plt.show()
data.loc[data['water'] < 125, 'water_adjusted'] = details['water'].mean()
data.loc[data['water'] >= 125, 'water_adjusted'] = data['water']
data.loc[data['water'] > 230, 'water_adjusted'] = details['water'].mean()
#data
plt.figure(figsize=(20,5))
sns.boxplot(data['water_adjusted'])
plt.show()
#superplastic
details = data.loc[(data['superplastic'] > 0)]
plt.figure(figsize=(20,5))
sns.boxplot(details['superplastic'])
plt.show()
data.loc[data['superplastic'] >= 17, 'superplastic_adjusted'] = details['superplastic'].mean()
data.loc[data['superplastic'] < 17, 'superplastic_adjusted'] = data['superplastic']
#data.loc[data['superplastic'] > 230, 'superplastic_adjusted'] = details['superplastic'].mean()
#data
plt.figure(figsize=(20,5))
sns.boxplot(data['superplastic_adjusted'])
plt.show()
#fineagg
details = data.loc[(data['fineagg'] > 0)]
plt.figure(figsize=(20,5))
sns.boxplot(details['fineagg'])
plt.show()
data.loc[data['fineagg'] >= 920, 'fineagg_adjusted'] = details['fineagg'].mean()
data.loc[data['fineagg'] < 920, 'fineagg_adjusted'] = data['fineagg']
data.loc[data['fineagg'] <= 670, 'fineagg_adjusted'] = details['fineagg'].mean()
#data.loc[data['superplastic'] > 230, 'superplastic_adjusted'] = details['superplastic'].mean()
#data
plt.figure(figsize=(20,5))
sns.boxplot(data['fineagg_adjusted'])
plt.show()
#Check for Nulls
nulls = data.isnull().any()
nulls
#Review: no missing values
#data.isnull().sum()
#IQR for each column
data.quantile(0.75) - data.quantile(0.25)
#Standard Deviation
print(data.std())
#Covariance
cov_data = (data.cov())
cov_data
data_attr = data.iloc[:, 0:9]
sns.pairplot(data_attr, diag_kind='kde')
data.corr()
#skewness of data
data.skew()
# symmetric: between -0.5 and 0.5
# moderately skewed: between -1 and -0.5, or between 0.5 and 1
# highly skewed: below -1 or above 1
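As a quick way to apply these thresholds, a small sketch (an illustration, not from the original notebook) that buckets each column by its skewness:
# Bucket each numeric column by skewness, using the thresholds above
skew_values = data.skew(numeric_only=True)
def skew_bucket(s):
    if abs(s) <= 0.5:
        return 'symmetric'
    elif abs(s) <= 1:
        return 'moderately skewed'
    return 'highly skewed'
print(skew_values.apply(skew_bucket).sort_values())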
# Distplot for each column
for x, y in enumerate(data):
    plt.figure(figsize=(20,5))
    sns.distplot(data[y], fit=stats.gamma, kde=False)
    plt.show()
# Boxplot for each column
for x, y in enumerate(data):
    plt.figure(figsize=(20,5))
    sns.boxplot(data[y])
    plt.show()
RESUME - Data Analysis and insights:
Data Analysis - column-by-column analysis
We have 1,030 rows in the dataset, with 25 duplicated rows; these were deleted from the dataset.
The original data are numerical. Most of the data are double / float64.
There are 9 columns, named as follows:
- Cement (cement) -- quantitative -- kg in a m3 mixture -- Input Variable
- Blast Furnace Slag (slag) -- quantitative -- kg in a m3 mixture -- Input Variable
- Fly Ash (ash) -- quantitative -- kg in a m3 mixture -- Input Variable
- Water (water) -- quantitative -- kg in a m3 mixture -- Input Variable
- Superplasticizer (superplastic) -- quantitative -- kg in a m3 mixture -- Input Variable
- Coarse Aggregate (coarseagg) -- quantitative -- kg in a m3 mixture -- Input Variable
- Fine Aggregate (fineagg) -- quantitative -- kg in a m3 mixture -- Input Variable
- Age (age) -- quantitative -- Days (1~365) -- Input Variable
- Concrete compressive strength (strength) -- quantitative -- MPa -- Output Variable
Looking at the statistics of the data, we can see:
There are no null values, but there are many zeros in slag, superplastic and ash.
We can assume that a zero in these columns means "no addition" of that product to the cement mix.
Based on this, we can create categorical variables that mark whether or not the mix contains slag, superplastic or ash.
Age ranges from 1 to 365 days, but most values are fixed (not 1, 2, 3, ... but 1, 3, 7, 14, 28, 56 and above). So we can use age_months as a categorical column, and we can also transform Age itself into categorical data, keeping each value up to 56 days and grouping the others (roughly an 80/20 split of the data).
The skewness of the data, and the actions taken:
Highly skewed:
age 3.27 - transformed into categorical data
Moderately skewed:
superplastic 0.91 - transformed into categorical data - excluding zeros gives a better fit
slag 0.80 - transformed into categorical data - excluding zeros gives a better fit
ash 0.54 - transformed into categorical data - excluding zeros gives a better fit
cement 0.51
Symmetric:
water 0.07
coarseagg -0.04
fineagg -0.25
strength 0.42
Columns "xxx_adjusted" (superplastic, water, slag and fineagg) were created with the outliers replaced, using the boxplots above to choose the cut-offs (an IQR-based alternative is sketched below).
This analysis was made before the graphs, based on the summary statistics.
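The "_adjusted" columns above use hand-picked cut-offs read from the boxplots. As a possible generalization, the sketch below applies the usual 1.5 * IQR rule instead and replaces out-of-range values with the column mean; it works on a copy of the data, so nothing in the pipeline below changes (the helper name and the column list are assumptions for illustration).
# Sketch: IQR-based outlier replacement as an alternative to the manual thresholds above
def cap_outliers_with_mean(df, col):
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    low, high = q1 - 1.5 * (q3 - q1), q3 + 1.5 * (q3 - q1)
    adjusted = df[col].copy()
    adjusted[(adjusted < low) | (adjusted > high)] = df[col].mean()
    return adjusted

iqr_demo = data.copy()  # work on a copy so the columns used later stay untouched
for col in ['slag', 'water', 'superplastic', 'fineagg']:
    iqr_demo[col + '_iqr_adjusted'] = cap_outliers_with_mean(iqr_demo, col)
iqr_demo.filter(like='_iqr_adjusted').describe()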
# Distplot for each column vs quantile of strength
q0 = data.loc[(data['strength_quantile'] == 0)]
q1 = data.loc[(data['strength_quantile'] == 1)]
q2 = data.loc[(data['strength_quantile'] == 2)]
q3 = data.loc[(data['strength_quantile'] == 3)]
for x, y in enumerate(data):
    plt.figure(figsize=(20,5))
    sns.distplot(q0[y]) #blue
    sns.distplot(q1[y]) #orange
    sns.distplot(q2[y]) #green
    sns.distplot(q3[y]) #rose
    plt.legend(labels=['q0','q1','q2','q3'])
    plt.title(y + ' per quantile')
    plt.show()
# Swarmplot for each column vs quantile of strength
for a, b in enumerate(data):
    plt.figure(figsize=(20,5))
    sns.swarmplot(y=data[b], x=data['strength_quantile'])
    plt.title(b + ' per quantile')
    plt.show()
# Categorical Variables
Cat_Vars = ['age_category','contains_superplastic','contains_ash','contains_slag']
#ax = sns.barplot(x="age_category", y="strength", data=data)
#ax.set(ylabel="Percent")
# Barplot of mean strength per category
for x, y in enumerate(Cat_Vars):
    plt.figure(figsize=(20,5))
    sns.barplot(x=data[y], y="strength", data=data)
    plt.show()
# Jointplot for each column vs strength
for a, b in enumerate(data):
    plt.figure(figsize=(30,10))
    sns.jointplot(x='strength', y=b, data=data, kind="reg", height=9)
    plt.suptitle(str.upper(b + ' per strength'), y=1, fontsize=10)
    plt.subplots_adjust(top=0.9)
    plt.show()
# Graphical view of the correlation matrix
def plot_correlation(data):
    corr = data.corr()
    fig, ax = plt.subplots(figsize=(30, 15))
    ax.matshow(corr)
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)
    for (i, j), z in np.ndenumerate(corr):
        ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center')
plot_correlation(data)
corr = data.corr()
print('\n Positive Correlation')
corr >= 0.4
print('\n Negative Correlation')
corr <= -0.4
Bivariate Data Analysis - Correlation Analysis
To better understand the correlation between each column and the target, I created strength quantiles.
Looking at the density and swarm plots, we can assume that:
- More cement is associated with higher strength
- Less water is associated with higher strength
- Less fineagg is associated with higher strength
- Less ash is associated with higher strength
- Superplastic increases strength
- Slag increases strength
- Coarseagg reduces strength
- Older mixes are stronger than younger ones up to 56 days; after that period strength decreases
- Superplastic and fineagg are negatively correlated with water
Created variables:
- age_months: created to group age from days into months
- strength_quantile: quantiles of strength, used for analysis
- age_category: age in days (int) converted into a categorical variable
- contains_superplastic: whether or not the mix contains superplastic
- contains_ash: whether or not the mix contains ash
- contains_slag: whether or not the mix contains slag
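As a quick numeric cross-check of the statements above, a minimal sketch that ranks each feature by its Pearson correlation with strength (it only uses columns already created above):
# Rank features by their correlation with the target, to cross-check the visual analysis
corr_with_target = data.corr()['strength'].drop('strength').sort_values(ascending=False)
print(corr_with_target)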
#Create an integer (binned) version of the target, used as class labels by the classifiers below
data['strength_categorical'] = pd.qcut(data['strength'],30, labels=False)
data['strength'] = data['strength'].astype('float64')
data.dtypes
data.groupby(by=['strength_categorical'])['strength'].agg(['mean', 'count','min','max']).sort_values(by='mean')
data.head(10)
#SCALE DATA
data_scaled = data.copy()
scaler = StandardScaler()
cols_to_scale = ['cement','slag','ash','water','superplastic','coarseagg','fineagg','age','strength','slag_adjusted','water_adjusted','superplastic_adjusted','fineagg_adjusted']
data_scaled[cols_to_scale] = scaler.fit_transform(data_scaled[cols_to_scale].to_numpy())
data_scaled.head()
X_scaled = data_scaled.drop(columns=['strength','strength_quantile','strength_categorical'])  # separate the target and target-derived columns from the features
y_scaled = data_scaled.strength
lab_enc = preprocessing.LabelEncoder()
y_scaled = lab_enc.fit_transform(y_scaled)  # encode the continuous target as integer labels so classifiers can be applied
X = data.drop(columns=['strength','strength_quantile','strength_categorical'])  # same separation on the unscaled data
y = data.strength
lab_enc = preprocessing.LabelEncoder()
y = lab_enc.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=22)
print(X_train.shape, X_test.shape)
lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
lr.score(X_test, y_test)
LR_Score_training = lr.score(X_train, y_train)
LR_Score_test = lr.score(X_test, y_test)
print('Training accuracy on Logistic Regression: %.3f' % LR_Score_training)
print('Testing accuracy on Logistic Regression: %.3f' % LR_Score_test)
#Cross validation
cv_LR = cross_val_score(lr, X, y, cv = 5).mean()
print(f'Cross validation score of Logistic Regression = %.3f'% cv_LR)
LinReg = LinearRegression()
LinReg.fit(X_train, y_train)
pred = LinReg.predict(X_test)
LinReg.score(X_test, y_test)
LinReg_Score_training = LinReg.score(X_train, y_train)
LinReg_Score_test = LinReg.score(X_test, y_test)
print('Training accuracy on Linear Regression: %.3f' % LinReg_Score_training)
print('Testing accuracy on Linear Regression: %.3f' % LinReg_Score_test)
# Build a Linear Regression for feature selection
linR = LinearRegression()
# Build step-forward feature selection
stepfw = sfs(linR, k_features=13, forward=True, scoring='r2', cv=5)
# Perform SFFS
stepfw = stepfw.fit(X_train, y_train)
stepfw.get_metric_dict()
fig = plot_sfs(stepfw.get_metric_dict())
plt.title('Sequential Forward Selection (w. R^2)')
plt.grid()
plt.show()
print('\n With 13 forward-selected features we have the best R^2 score')
# Best Features
columnList = list(X_train.columns)
feat_cols = list(stepfw.k_feature_idx_)
subsetColumnList = [columnList[i] for i in feat_cols]
print('The most relevant features: ',subsetColumnList)
linR = LinearRegression()
linR.fit(X_train[subsetColumnList], y_train)
y_train_pred = linR.predict(X_train[subsetColumnList])
y_test_pred = linR.predict(X_test[subsetColumnList])
linR_SFW_score_training = linR.score(X_train[subsetColumnList], y_train)
linR_SFW_score_test =linR.score(X_test[subsetColumnList], y_test)
print('Training accuracy on step-forward selected features for Linear Regression: %.3f' % linR_SFW_score_training)
print('Testing accuracy on step-forward selected features for Linear Regression: %.3f' % linR_SFW_score_test)
#Cross validation for Linear Regression with selected features
cv_linR = cross_val_score(linR, X, y, cv = 5).mean()
print('Cross validation score of Linear Regression with selected features = %.3f' % cv_linR)
linR = LinearRegression()
linR.fit(X_train, y_train)
y_train_pred = linR.predict(X_train)
y_test_pred = linR.predict(X_test)
print('For comparison, Training accuracy on all features: %.3f' % linR.score(X_train, y_train))
print('For comparison, Testing accuracy on all features: %.3f' % linR.score(X_test, y_test))
# Kfold Cross Validation for Linear Regression with selected features
seed = 5
splits = 10
kfold = model_selection.KFold(n_splits=splits, shuffle=True, random_state=seed)
model = LinearRegression()
results = model_selection.cross_val_score(model, X[subsetColumnList], y, cv=kfold, scoring='r2')
kfold_LinearRegressionSelected = results.mean()
print('Kfold Cross Validation Score for Linear Regression with Selected Features: %.3f' % kfold_LinearRegressionSelected)
#Cross validation
cv_LinReg_Selected = cross_val_score(model, X, y, cv = 5).mean()
print(f'Cross validation score of Linear Regression with Selected Features = %.3f'% cv_LinReg_Selected)
kfold = model_selection.KFold(n_splits=splits, shuffle=True, random_state=seed)
model = LinearRegression()
results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring='r2')
kfold_LinearRegressionALL = results.mean()
print('\nKFold Cross Validation Score for Linear Regression with ALL Features: %.3f' % kfold_LinearRegressionALL)
#Cross validation
cv_LinReg_All = cross_val_score(model, X, y, cv = 5).mean()
print(f'Cross validation score of Linear Regression with all Features = %.3f'% cv_LinReg_All)
- Considering the number of parameters, the complexity of the data, and the meaningful categorical variables, it is better to implement a more complex model rather than a linear one.
- It is better to use a quadratic or higher-degree model because the columns do not show clearly linear relationships: multiple densities, weak correlations, skewed data, etc. (see the sketch after this list).
- The scores for this kind of (linear) model are already high and may be increased further with tuning.
- The most relevant features: 'cement', 'slag', 'ash', 'superplastic', 'age', 'age_months', 'age_category', 'contains_superplastic', 'contains_ash', 'slag_adjusted', 'water_adjusted', 'fineagg_adjusted', 'contains_slag'.
- Using this set of features, model performance increases by about 0.01.
- This model is stable under K-fold cross validation.
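A minimal sketch of the quadratic idea mentioned above, assuming the X_train/X_test split and label-encoded target already defined (an illustration, not part of the original pipeline):
# Sketch: degree-2 polynomial features fed into a Linear Regression, scored with R^2
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

poly_model = make_pipeline(PolynomialFeatures(degree=2, include_bias=False), LinearRegression())
poly_model.fit(X_train, y_train)
print('Quadratic Linear Regression R^2 on test: %.3f' % poly_model.score(X_test, y_test))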
#Build Decision Tree
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dtscore = dt.score(X_test, y_test)
pred = dt.predict(X_test)
DT_score_training = dt.score(X_train, y_train)
DT_score_test = dt.score(X_test, y_test)
print('Training accuracy on Decision Tree: %.3f' % DT_score_training)
print('Testing accuracy on Decision Tree: %.3f' % DT_score_test)
#Cross validation for Decision Tree dt
cv_dt = cross_val_score(dt, X, y, cv = 5).mean()
print(f'Cross validation score of Decision tree = %.3f'% cv_dt)
# Kfold Cross Validation
seed = 5
splits = 10
kfold = model_selection.KFold(n_splits=splits, shuffle=True, random_state=seed)
results = model_selection.cross_val_score(dt, X, y, cv=kfold, scoring='r2')
kfold_DT = results.mean()
print('Cross Validation Score for Decision Tree: %.3f' % kfold_DT)
rf = RandomForestClassifier(criterion = 'entropy', max_depth = 7, min_samples_leaf=5)
rf.fit(X_train, y_train)
rf_score_training = rf.score(X_train, y_train)
rf_score_test = rf.score(X_test, y_test)
print('Training accuracy on Random Forest: %.3f' % rf_score_training)
print('Testing accuracy on Random Forest: %.3f' % rf_score_test)
#Cross validation Random Forest rf
cv_rf = cross_val_score(rf, X, y, cv = 5).mean()
print('Cross Validation Score of Random forest: %.3f' % cv_rf)
#Build Bagging Classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1)
dTreeR = DecisionTreeClassifier(criterion = 'gini', random_state=1, max_depth = 5)
dTreeR.fit(X_train, y_train)
bgcl = BaggingClassifier(base_estimator=dTreeR, n_estimators=100,random_state=1, max_samples= .3, bootstrap=True)
bgcl = bgcl.fit(X_train, y_train)
y_predict = bgcl.predict(X_test)
bgcl_score_training = bgcl.score(X_train, y_train)
bgcl_score_test =bgcl.score(X_test, y_test)
print('Training accuracy on Bagging Classifier: %.3f' % bgcl_score_training)
print('Testing accuracy on Bagging Classifier: %.3f' % bgcl_score_test)
#Cross validation
bgcl_cv = cross_val_score(bgcl, X, y, cv = 5).mean()
print('Cross Validation Score of Bagging Classifier: %.3f' % bgcl_cv)
#Build GradientBoosting Classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1)
gbcl = GradientBoostingClassifier(n_estimators=5,learning_rate = 0.05)
gbcl = gbcl.fit(X_train, y_train)
y_predict = gbcl.predict(X_test)
gbcl_score_training = gbcl.score(X_train, y_train)
gbcl_score_test =gbcl.score(X_test, y_test)
print('Training accuracy on GradientBoosting Classifier: %.3f' % gbcl_score_training)
print('Testing accuracy on GradientBoosting Classifier: %.3f' % gbcl_score_test)
#Cross validation
gbcl_cv = cross_val_score(gbcl, X, y, cv = 5).mean()
print('Cross Validation Score of GradientBoosting Classifier: %.3f' % gbcl_cv)
#Build AdaBoost Classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1)
abcl = AdaBoostClassifier(n_estimators=10, random_state=1)
abcl = abcl.fit(X_train, y_train)
y_predict = abcl.predict(X_test)
abcl_score_training = abcl.score(X_train, y_train)
abcl_score_test = abcl.score(X_test, y_test)
print('Training accuracy on AdaBoost Classifier: %.3f' % abcl_score_training)
print('Testing accuracy on AdaBoost Classifier: %.3f' % abcl_score_test)
#Cross validation
abcl_cv = cross_val_score(abcl, X, y, cv = 5).mean()
print('Cross Validation Score of AdaBoost Classifier: %.3f' % abcl_cv)
#Creating a grid of hyperparameters
ada_boost = AdaBoostClassifier()
params = {'n_estimators': [100,200,300],
'learning_rate': [0.5,1.0],
'algorithm': ['SAMME','SAMME.R']
}
gs = GridSearchCV(estimator = ada_boost, param_grid = params, cv = 3, n_jobs = -1)
gs.fit(X_train, y_train)
gs.best_params_
resultDF = pd.DataFrame(gs.cv_results_['params'])
resultDF['mean_test_score'] = (gs.cv_results_['mean_test_score']*100.00)
resultDF.sort_values('mean_test_score',ascending = False)
# MODEL CREATED "WITH" GRIDSEARCH RESULTS
ada_gsparams = AdaBoostClassifier(algorithm= 'SAMME.R', learning_rate= 0.5, n_estimators= 300)
ada_gsparams.fit(X_train[subsetColumnList], y_train)
y_predict = ada_gsparams.predict(X_test[subsetColumnList])
ada_score_gs_test = ada_gsparams.score(X_test[subsetColumnList], y_test)
ada_score_gs_train = ada_gsparams.score(X_train[subsetColumnList], y_train)
print('Training accuracy on AdaBoosting with GridSearch HP: %.3f' % ada_score_gs_train)
print('Testing accuracy on AdaBoosting with GridSearch HP: %.3f' % ada_score_gs_test)
results_comparison = pd.DataFrame(y_predict)
results_comparison['test'] = y_test
#Cross validation
ada_gs_cv = cross_val_score(ada_gsparams, X, y, cv = 5).mean()
print('Cross Validation Score of AdaBoost Classifier with GridSearch Optimization: %.3f' % ada_gs_cv)
Resume = {'Model': ['Logistic Regression', 'Linear Regression',
                    'Linear Regression w/ StepForward', 'Decision Trees', 'Random Forest',
                    'Bagging', 'Gradient Boosting', 'AdaBoosting', 'AdaBoosting with GridSearch'],
          'Training Score': [LR_Score_training, LinReg_Score_training,
                             linR_SFW_score_training, DT_score_training, rf_score_training,
                             bgcl_score_training, gbcl_score_training, abcl_score_training, ada_score_gs_train],
          'Testing Score': [LR_Score_test, LinReg_Score_test,
                            linR_SFW_score_test, DT_score_test, rf_score_test,
                            bgcl_score_test, gbcl_score_test, abcl_score_test, ada_score_gs_test],
          'Cross Validation Score': [cv_LR, cv_LinReg_All,
                                     cv_linR, cv_dt, cv_rf,
                                     bgcl_cv, gbcl_cv, abcl_cv, ada_gs_cv]
          }
Resume = pd.DataFrame(Resume).sort_values(['Cross Validation Score', 'Testing Score', 'Training Score'], ascending=False)
Resume
WITH A LINEAR REGRESSION, WE CAN REACH MORE THAN 80% ACCURACY ON THE PREDICTION OF STRENGTH.
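Since the objective is stated in terms of R^2, a minimal regression-style check (a sketch that reuses the feature matrix X but scores against the original, unencoded strength column; the variable names below are new and for illustration only):
# Sketch: score a plain regression against the original continuous strength target
from sklearn.metrics import r2_score

y_raw = data.strength  # continuous target, without label encoding
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X, y_raw, test_size=.3, random_state=22)
reg = LinearRegression().fit(Xr_train, yr_train)
print('Linear Regression R^2 on test: %.3f' % r2_score(yr_test, reg.predict(Xr_test)))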